www.gusucode.com > C++ Primer 书籍源代码源码程序 > C++ Primer 书籍源代码源码程序/code/10/3ed_query.cpp

    //Download by http://www.NewXing.com
/*
 * This file contains code from "C++ Primer, Fourth Edition", by Stanley B.
 * Lippman, Jose Lajoie, and Barbara E. Moo, and is covered under the
 * copyright and warranty notices given in that book:
 * 
 * "Copyright (c) 2005 by Objectwrite, Inc., Jose Lajoie, and Barbara E. Moo."
 * 
 * 
 * "The authors and publisher have taken care in the preparation of this book,
 * but make no expressed or implied warranty of any kind and assume no
 * responsibility for errors or omissions. No liability is assumed for
 * incidental or consequential damages in connection with or arising out of the
 * use of the information or programs contained herein."
 * 
 * Permission is granted for this code to be used for educational purposes in
 * association with the book, given proper citation if and when posted or
 * reproduced.Any commercial use of this code requires the explicit written
 * permission of the publisher, Addison-Wesley Professional, a division of
 * Pearson Education, Inc. Send your request for permission, stating clearly
 * what code you would like to use, and in what specific way, to the following
 * address: 
 * 
 * 	Pearson Education, Inc.
 * 	Rights and Contracts Department
 * 	75 Arlington Street, Suite 300
 * 	Boston, MA 02216
 * 	Fax: (617) 848-7047
*/ 

#include "3ed_query.h"
#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <fstream>
#include <cctype>
#include <cstring>

using std::set;
using std::string;
using std::getline;
using std::map;
using std::vector;
using std::cerr;
using std::cout;
using std::cin;
using std::ifstream;
using std::endl;
using std::pair;
using std::make_pair;
using std::ispunct;
using std::tolower;
using std::strlen;

// Consume the input stream one line at a time, appending every line
// to lines_of_text in the order it was read.
void TextQuery::store_file(ifstream &is)
{
    for (string current; getline(is, current); )
        lines_of_text.push_back(current);
}
// words that are never entered in the map; filled in once, during
// static initialization, from the result of build_exclusion_set()
set<string> TextQuery::exclusion_set(build_exclusion_set());

// characters that separate words: besides space, tab and newline,
// vertical tab (\v), carriage return (\r) and formfeed (\f) also count
string TextQuery::whitespace_chars = " \t\n\v\r\f";

// finds the whitespace separated words in the input vector
// then puts them in a cannonical form: eliminate suffixes,
// make lowercase etc. Finally, test if its an excluded word or not.
// If not excluded store the word in word_map along with
// its location (line # and character position) info
void TextQuery::build_map()
{
    // process each line from the input vector
    for (line_no line_num = 0; 
                 line_num != lines_of_text.size();
                 ++line_num)
    {
        string textline = lines_of_text[line_num];
        if (textline.empty()) continue; //ignore blank lines

        // make line lower case and remove extraneous punctuation
        strip_caps(textline);
        strip_punct(textline);

        /* 
         * Because we want to keep track of position as well as
         * line number, we have to process the line a character at a time.
         * We can't use an istringstream to read the words.
         * prev_pos will denote first character in each word,
         * pos will denote whitespace that separates the word from the next
         * initially pos denotes first non-whitespace at beginning of line
        */
        str_size pos = skip_whitespace(textline, 0), prev_pos = pos;

        // find each whitespace separated word
        while ((pos = textline.find_first_of(whitespace_chars, pos)) 
                   != string::npos)
        {
            // remove suffixes and put the word into the map if apporpriate
            test_insert(textline, prev_pos, pos, line_num);

            // if there's more text to process, increment pos to get next char
            if (pos != textline.size())
                ++pos;  
            // read and discard adjacent spaces, if any, updating prev_pos too
            pos = prev_pos = skip_whitespace(textline, pos);
        }

        // don't forget last word in the line
        if (pos != prev_pos)  // false if line ends in whitespace
            test_insert(textline, prev_pos, pos, line_num);
    }
}

// Returns the index of the first non-whitespace character of line at
// or after pos; when only whitespace (or nothing) remains, returns
// line.size() instead.
TextQuery::str_size 
TextQuery::skip_whitespace(const string &line, str_size pos)
{
    const str_size first_text = line.find_first_not_of(whitespace_chars, pos);
    return first_text == string::npos ? line.size() : first_text;
}

void TextQuery::test_insert(const string &line, str_size prev_pos, 
                  str_size pos, line_no line_num)
{
    // make copy of the whitespace delimited word
    string word(line.substr(prev_pos, pos - prev_pos));

    strip_suffixes(word); // last of the cleanup operations

    // if there's anything left after stripping punctuation 
    // and it's not an excluded word, add to the map
    // appending line num & char pos to vector for this word
    if (!word.empty() && !exclude_word(word))
         word_map[word].push_back(make_pair(line_num,prev_pos));

    return;
}

// Removes recognized suffixes from word, in place.  Words of three
// characters or fewer are considered too short to carry a suffix and
// are left alone.  So far only a trailing 's' is handled (by suffix_s).
void TextQuery::strip_suffixes(string &word)
{
    const string::size_type len = word.size();

    if (len > 3 && word[len - 1] == 's')
        suffix_s(word);

    // handling for further kinds of suffix would be added here
}

// Handles words that end in 's': unless the ending shows that the 's'
// belongs to the word itself, the first matching plural/possessive
// ending is replaced, in place, by its base form.
// The tables are arrays of pointers to const char: string literals are
// constant, and binding them to plain char* is no longer valid C++.
void TextQuery::suffix_s(string &word)
{
    // some words ending in s aren't suffixes, they're part of the word
    static const char* const ok_endings[] = {"ous", "ius", "ss", "is"};
    const size_t ok_count = sizeof(ok_endings) / sizeof(ok_endings[0]);
    for (size_t i = 0; i != ok_count; ++i)
        if (chk_ending(word, ok_endings[i]) == 0)
              return;

    // replace common suffixes by their base word ending
    // repl_endings first dimension is the ending we'll remove
    //              second dimension is the new ending we'll insert
    // order matters: the first ending that matches wins, so the
    // catch-all "s" has to come last
    static const char* const repl_endings[][2] = 
          { {"ies", "y"}, {"ses", "s"}, {"\'s", ""}, {"s", ""} };

    // number of rows (pairs), independent of the column count
    const size_t repl_count = sizeof(repl_endings) / sizeof(repl_endings[0]);
    for (size_t i = 0; i != repl_count; ++i) 
        if (chk_ending(word, repl_endings[i][0]) == 0) {
           const size_t ending_len = strlen(repl_endings[i][0]);
           word.replace(word.size() - ending_len, ending_len,
                        repl_endings[i][1]);
           return;
        }
}

// Compare the end of the word with the ending we're given.
// Returns 0 when word ends with ending and a nonzero value otherwise,
// following the convention of string::compare.
int TextQuery::chk_ending(const string &word, const char *ending)
{
    const size_t sz = strlen(ending);

    // a word shorter than the ending cannot end with it; without this
    // test word.size() - sz would wrap around and compare would throw
    // std::out_of_range
    if (word.size() < sz)
        return -1;

    return word.compare(word.size() - sz, sz, ending);
}

// Converts every character of line to lower case, in place.
void TextQuery::strip_caps(string &line)
{
    // not changing the size of line, so safe to cache the size
    str_size sz = line.size();
    for (str_size pos = 0; pos != sz; ++pos) {
        // tolower requires a value representable as unsigned char;
        // handing it a negative plain char is undefined behavior
        const unsigned char ch = static_cast<unsigned char>(line[pos]);
        line[pos] = static_cast<char>(tolower(ch));
    }
}

// Replaces each punctuation character of line by a space, in place.
// The apostrophe is the one exception and is kept: it may introduce
// the 's suffix, which suffix_s takes care of later.
void TextQuery::strip_punct(string &line)
{
    for (str_size pos = 0; pos != line.size(); ++pos) {
        // ispunct requires a value representable as unsigned char;
        // handing it a negative plain char is undefined behavior
        const unsigned char ch = static_cast<unsigned char>(line[pos]);
        if (ch != '\'' && ispunct(ch))
            line[pos] = ' ';
    }
}

// Builds the set of words that are left out of the map.
// The words are read, whitespace separated, from a file named
// "exclusion_set"; when that file can't be opened a warning is
// written to cerr and a built-in default list is used instead.
// (Eventually the user should be able to name the file of words to
// ignore; for now we keep it simple and assume a specific file.)
set<string> TextQuery::build_exclusion_set()
{
    ifstream infile("exclusion_set");

    if (infile) {
        // normal case: every word in the file is excluded
        set<string> from_file;
        string word;
        while (infile >> word)
            from_file.insert(word);
        return from_file;
    }

    // fallback when the file is unavailable
    static string default_excluded_words[] = {
      "the","and","but","that","then","are","been",
      "can","can't","cannot","could","did","for",
      "had","have","him","his","her","its","into",
      "were","which","when","with","would"
     };
    const size_t word_count = 
        sizeof(default_excluded_words) / sizeof(default_excluded_words[0]);

    cerr << "warning! unable to open word exclusion file! -- "
        << "using default set" << endl;

    return set<string>(default_excluded_words, 
                       default_excluded_words + word_count);
}

// Reports whether word is a member of exclusion_set.
bool TextQuery::exclude_word(const string &word)
{
    // a set holds each key at most once, so count yields 0 or 1
    return exclusion_set.count(word) != 0;
}

// Looks up the word s and returns a copy of the locations recorded
// for it in word_map; the result is empty when the word is not there.
// The query is put through the same cleanup as the indexed text:
// lower case, punctuation replaced by spaces, suffixes removed.
vector<TextQuery::location> TextQuery::run_query(const string &s)
{
    // make local copy so we can clean it up to match words
    // entered in the map; but when communicating back to the
    // user always use their original version
    string sought = s;  
    strip_caps(sought);
    strip_punct(sought);

    // strip_punct turns punctuation into spaces, so a query such as
    // "word," is now "word ".  No key in the map contains whitespace,
    // and the trailing space would also hide the final 's' from
    // strip_suffixes, so trim surrounding whitespace first.
    const string::size_type first = sought.find_first_not_of(whitespace_chars);
    if (first == string::npos)
        return vector<location>();  // nothing but punctuation/whitespace
    const string::size_type last = sought.find_last_not_of(whitespace_chars);
    sought = sought.substr(first, last - first + 1);

    strip_suffixes(sought);

    // Note: must use find and not subscript the map directly
    // Subscripting a map adds the element if it's not already there
    // We want to know whether the element was there to begin with
    // The iterator returned by find also gives access to the
    // locations, so a second lookup is not needed
    map< string, vector<location> >::const_iterator found = 
        word_map.find(sought);
    if (found == word_map.end())
         return vector<location>();  // not found, return empty location vector

    // list of locations for this word
    return found->second;
}

void TextQuery::display_map()
{
    typedef map< string,vector<location> > map_text;
    map_text::iterator iter = word_map.begin(),
                   iter_end = word_map.end();

    // Note: map iter returns index, value pair
    // so iter->first is the index word, 
    // iter->second the vector of its locations

    // for each word in the map
    while (iter != iter_end) {
        cout << "word: " << iter->first << " {";

        vector<location> text_locs = iter->second;
        vector<location>::iterator liter = text_locs.begin(),
                               liter_end = text_locs.end();

        // print all (line,char) locations for this word
        // because this is a debugging routine, don't adjust
        // line/pos numbers.  As normal for programmers, start from 0
        while (liter != liter_end)
        {
            cout << "(" << liter->first
                 << "," << liter->second << ")";

            if (++liter != liter_end)
                 cout << ", ";

         }

         cout << "}\n";  // end list of output this word
         ++iter;         // get next word in the map
    }
    cout << endl;  // finished printing entire map
}